Learning objectives
Principles
Training dataset => overfitting if an 8th-degree polynomial is fitted
Training dataset => also shows overfitting
Testing dataset => the 8th-degree polynomial fits poorly
Algebraic and graphical demonstration of the trade-off between bias and variance
Prostate <- read.csv("Data/prostate.csv")
# Divide into training and testing sets; Randomly select 20 rows to be testing dataset
set.seed(23456)
TestingIndex <- sample(1:nrow(Prostate), 20)
Training <- Prostate[-TestingIndex, ]
Testing <- Prostate[TestingIndex, ]lm(y~x)lm(y~.) A period “.” means all other variables
in the data set, a minus “–” excludes a variable (more convenient)step(function, direction="backward")## Backward selection (start from the most complicated model first)
Full <- lm(lpsa ~ lcavol + lweight + age + lbph + svi + lcp + gleason + pgg45,
data = Training)
Backward1 <- step(Full, direction = "backward")
# Alternative for backward selection
Alt <- lm(lpsa ~ . - age - svi, data = Training) #age and svi are removed in this case
Backward2 <- step(Full, direction = "backward", test = "F")
Backward3 <- step(Full, direction = "backward", test = "Chisq")
## Forward selection
Null <- lm(lpsa ~ 1, data = Training)
Forward <- step(Null, scope = list(lower = Null, upper = Full),
direction = "forward")
## Stepwise selection
Stepwise1 <- step(Full, direction = "both")
Stepwise2 <- step(Null, scope = list(lower = Null, upper = Full),
direction = "both")## Start: AIC=-39.31
## lpsa ~ lcavol + lweight + age + lbph + svi + lcp + gleason +
## pgg45
##
## Df Sum of Sq RSS AIC
## - gleason 1 0.1810 36.760 -40.933
## - lcp 1 0.2693 36.849 -40.748
## - pgg45 1 0.3331 36.912 -40.615
## <none> 36.579 -39.313
## - age 1 1.1468 37.726 -38.936
## - lbph 1 2.3121 38.891 -36.594
## - svi 1 3.8819 40.461 -33.547
## - lweight 1 3.9289 40.508 -33.457
## - lcavol 1 20.3290 56.908 -7.282
##
## Step: AIC=-40.93
## lpsa ~ lcavol + lweight + age + lbph + svi + lcp + pgg45
##
## Df Sum of Sq RSS AIC
## - lcp 1 0.3002 37.060 -42.307
## <none> 36.760 -40.933
## - age 1 1.1118 37.872 -40.638
## - pgg45 1 1.4277 38.188 -39.999
## - lbph 1 2.2925 39.053 -38.275
## - lweight 1 3.7861 40.546 -35.385
## - svi 1 3.8174 40.578 -35.325
## - lcavol 1 21.7089 58.469 -7.198
##
## Step: AIC=-42.31
## lpsa ~ lcavol + lweight + age + lbph + svi + pgg45
##
## Df Sum of Sq RSS AIC
## - age 1 0.9174 37.978 -42.424
## <none> 37.060 -42.307
## - pgg45 1 1.1290 38.190 -41.996
## - lbph 1 2.2019 39.262 -39.863
## - svi 1 3.6525 40.713 -37.069
## - lweight 1 3.8516 40.912 -36.693
## - lcavol 1 25.1255 62.186 -4.453
##
## Step: AIC=-42.42
## lpsa ~ lcavol + lweight + lbph + svi + pgg45
##
## Df Sum of Sq RSS AIC
## - pgg45 1 0.8161 38.794 -42.787
## <none> 37.978 -42.424
## - lbph 1 1.6290 39.607 -41.190
## - lweight 1 3.3245 41.302 -37.962
## - svi 1 3.9171 41.895 -36.865
## - lcavol 1 24.4609 62.439 -6.141
##
## Step: AIC=-42.79
## lpsa ~ lcavol + lweight + lbph + svi
##
## Df Sum of Sq RSS AIC
## <none> 38.794 -42.787
## - lbph 1 2.0087 40.803 -40.900
## - lweight 1 2.9786 41.773 -39.091
## - svi 1 5.8688 44.663 -33.939
## - lcavol 1 27.6768 66.471 -3.322
## Start: AIC=-39.31
## lpsa ~ lcavol + lweight + age + lbph + svi + lcp + gleason +
## pgg45
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## - gleason 1 0.1810 36.760 -40.933 0.3366 0.563740
## - lcp 1 0.2693 36.849 -40.748 0.5007 0.481611
## - pgg45 1 0.3331 36.912 -40.615 0.6192 0.434086
## <none> 36.579 -39.313
## - age 1 1.1468 37.726 -38.936 2.1320 0.148860
## - lbph 1 2.3121 38.891 -36.594 4.2982 0.041944 *
## - svi 1 3.8819 40.461 -33.547 7.2163 0.009072 **
## - lweight 1 3.9289 40.508 -33.457 7.3038 0.008682 **
## - lcavol 1 20.3290 56.908 -7.282 37.7911 4.689e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=-40.93
## lpsa ~ lcavol + lweight + age + lbph + svi + lcp + pgg45
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## - lcp 1 0.3002 37.060 -42.307 0.5635 0.455425
## <none> 36.760 -40.933
## - age 1 1.1118 37.872 -40.638 2.0869 0.153094
## - pgg45 1 1.4277 38.188 -39.999 2.6799 0.106175
## - lbph 1 2.2925 39.053 -38.275 4.3031 0.041776 *
## - lweight 1 3.7861 40.546 -35.385 7.1066 0.009558 **
## - svi 1 3.8174 40.578 -35.325 7.1653 0.009279 **
## - lcavol 1 21.7089 58.469 -7.198 40.7482 1.709e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=-42.31
## lpsa ~ lcavol + lweight + age + lbph + svi + pgg45
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## - age 1 0.9174 37.978 -42.424 1.7327 0.192360
## <none> 37.060 -42.307
## - pgg45 1 1.1290 38.190 -41.996 2.1325 0.148675
## - lbph 1 2.2019 39.262 -39.863 4.1589 0.045193 *
## - svi 1 3.6525 40.713 -37.069 6.8990 0.010587 *
## - lweight 1 3.8516 40.912 -36.693 7.2750 0.008754 **
## - lcavol 1 25.1255 62.186 -4.453 47.4572 1.99e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=-42.42
## lpsa ~ lcavol + lweight + lbph + svi + pgg45
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## - pgg45 1 0.8161 38.794 -42.787 1.5257 0.220827
## <none> 37.978 -42.424
## - lbph 1 1.6290 39.607 -41.190 3.0454 0.085294 .
## - lweight 1 3.3245 41.302 -37.962 6.2153 0.014999 *
## - svi 1 3.9171 41.895 -36.865 7.3230 0.008519 **
## - lcavol 1 24.4609 62.439 -6.141 45.7300 3.193e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=-42.79
## lpsa ~ lcavol + lweight + lbph + svi
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 38.794 -42.787
## - lbph 1 2.0087 40.803 -40.900 3.7280 0.057445 .
## - lweight 1 2.9786 41.773 -39.091 5.5281 0.021454 *
## - svi 1 5.8688 44.663 -33.939 10.8922 0.001504 **
## - lcavol 1 27.6768 66.471 -3.322 51.3670 5.432e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Start: AIC=-39.31
## lpsa ~ lcavol + lweight + age + lbph + svi + lcp + gleason +
## pgg45
##
## Df Sum of Sq RSS AIC Pr(>Chi)
## - gleason 1 0.1810 36.760 -40.933 0.537516
## - lcp 1 0.2693 36.849 -40.748 0.452296
## - pgg45 1 0.3331 36.912 -40.615 0.403471
## <none> 36.579 -39.313
## - age 1 1.1468 37.726 -38.936 0.123129
## - lbph 1 2.3121 38.891 -36.594 0.029823 *
## - svi 1 3.8819 40.461 -33.547 0.005323 **
## - lweight 1 3.9289 40.508 -33.457 0.005066 **
## - lcavol 1 20.3290 56.908 -7.282 5.425e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=-40.93
## lpsa ~ lcavol + lweight + age + lbph + svi + lcp + pgg45
##
## Df Sum of Sq RSS AIC Pr(>Chi)
## - lcp 1 0.3002 37.060 -42.307 0.428742
## <none> 36.760 -40.933
## - age 1 1.1118 37.872 -40.638 0.129847
## - pgg45 1 1.4277 38.188 -39.999 0.086733 .
## - lbph 1 2.2925 39.053 -38.275 0.030906 *
## - lweight 1 3.7861 40.546 -35.385 0.006007 **
## - svi 1 3.8174 40.578 -35.325 0.005812 **
## - lcavol 1 21.7089 58.469 -7.198 2.261e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=-42.31
## lpsa ~ lcavol + lweight + age + lbph + svi + pgg45
##
## Df Sum of Sq RSS AIC Pr(>Chi)
## - age 1 0.9174 37.978 -42.424 0.170020
## <none> 37.060 -42.307
## - pgg45 1 1.1290 38.190 -41.996 0.128480
## - lbph 1 2.2019 39.262 -39.863 0.035023 *
## - svi 1 3.6525 40.713 -37.069 0.007139 **
## - lweight 1 3.8516 40.912 -36.693 0.005794 **
## - lcavol 1 25.1255 62.186 -4.453 2.737e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=-42.42
## lpsa ~ lcavol + lweight + lbph + svi + pgg45
##
## Df Sum of Sq RSS AIC Pr(>Chi)
## - pgg45 1 0.8161 38.794 -42.787 0.200720
## <none> 37.978 -42.424
## - lbph 1 1.6290 39.607 -41.190 0.072130 .
## - lweight 1 3.3245 41.302 -37.962 0.011023 *
## - svi 1 3.9171 41.895 -36.865 0.005973 **
## - lcavol 1 24.4609 62.439 -6.141 6.119e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=-42.79
## lpsa ~ lcavol + lweight + lbph + svi
##
## Df Sum of Sq RSS AIC Pr(>Chi)
## <none> 38.794 -42.787
## - lbph 1 2.0087 40.803 -40.900 0.0486580 *
## - lweight 1 2.9786 41.773 -39.091 0.0170030 *
## - svi 1 5.8688 44.663 -33.939 0.0009893 ***
## - lcavol 1 27.6768 66.471 -3.322 1.2e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Start: AIC=32.5
## lpsa ~ 1
##
## Df Sum of Sq RSS AIC
## + lcavol 1 61.297 53.121 -24.585
## + lcp 1 35.552 78.865 5.843
## + svi 1 35.144 79.273 6.241
## + pgg45 1 20.457 93.961 19.328
## + gleason 1 16.072 98.346 22.841
## + lweight 1 12.519 101.899 25.574
## + lbph 1 5.991 108.426 30.354
## <none> 114.418 32.496
## + age 1 2.739 111.679 32.630
##
## Step: AIC=-24.59
## lpsa ~ lcavol
##
## Df Sum of Sq RSS AIC
## + lweight 1 6.8011 46.320 -33.134
## + lbph 1 5.4723 47.648 -30.957
## + svi 1 5.1858 47.935 -30.495
## + pgg45 1 2.6036 50.517 -26.455
## <none> 53.121 -24.585
## + gleason 1 1.2538 51.867 -24.425
## + lcp 1 1.2484 51.872 -24.417
## + age 1 0.1665 52.954 -22.827
##
## Step: AIC=-33.13
## lpsa ~ lcavol + lweight
##
## Df Sum of Sq RSS AIC
## + svi 1 5.5170 40.803 -40.900
## + pgg45 1 3.2417 43.078 -36.721
## + gleason 1 2.1983 44.121 -34.878
## + lcp 1 1.8802 44.439 -34.325
## + lbph 1 1.6569 44.663 -33.939
## <none> 46.320 -33.134
## + age 1 0.1505 46.169 -31.385
##
## Step: AIC=-40.9
## lpsa ~ lcavol + lweight + svi
##
## Df Sum of Sq RSS AIC
## + lbph 1 2.00867 38.794 -42.787
## + pgg45 1 1.19581 39.607 -41.190
## + gleason 1 1.13789 39.665 -41.077
## <none> 40.803 -40.900
## + age 1 0.11618 40.686 -39.119
## + lcp 1 0.01958 40.783 -38.937
##
## Step: AIC=-42.79
## lpsa ~ lcavol + lweight + svi + lbph
##
## Df Sum of Sq RSS AIC
## <none> 38.794 -42.787
## + gleason 1 0.86851 37.925 -42.530
## + pgg45 1 0.81611 37.978 -42.424
## + age 1 0.60442 38.190 -41.996
## + lcp 1 0.00350 38.790 -40.794
## Start: AIC=-39.31
## lpsa ~ lcavol + lweight + age + lbph + svi + lcp + gleason +
## pgg45
##
## Df Sum of Sq RSS AIC
## - gleason 1 0.1810 36.760 -40.933
## - lcp 1 0.2693 36.849 -40.748
## - pgg45 1 0.3331 36.912 -40.615
## <none> 36.579 -39.313
## - age 1 1.1468 37.726 -38.936
## - lbph 1 2.3121 38.891 -36.594
## - svi 1 3.8819 40.461 -33.547
## - lweight 1 3.9289 40.508 -33.457
## - lcavol 1 20.3290 56.908 -7.282
##
## Step: AIC=-40.93
## lpsa ~ lcavol + lweight + age + lbph + svi + lcp + pgg45
##
## Df Sum of Sq RSS AIC
## - lcp 1 0.3002 37.060 -42.307
## <none> 36.760 -40.933
## - age 1 1.1118 37.872 -40.638
## - pgg45 1 1.4277 38.188 -39.999
## + gleason 1 0.1810 36.579 -39.313
## - lbph 1 2.2925 39.053 -38.275
## - lweight 1 3.7861 40.546 -35.385
## - svi 1 3.8174 40.578 -35.325
## - lcavol 1 21.7089 58.469 -7.198
##
## Step: AIC=-42.31
## lpsa ~ lcavol + lweight + age + lbph + svi + pgg45
##
## Df Sum of Sq RSS AIC
## - age 1 0.9174 37.978 -42.424
## <none> 37.060 -42.307
## - pgg45 1 1.1290 38.190 -41.996
## + lcp 1 0.3002 36.760 -40.933
## + gleason 1 0.2119 36.849 -40.748
## - lbph 1 2.2019 39.262 -39.863
## - svi 1 3.6525 40.713 -37.069
## - lweight 1 3.8516 40.912 -36.693
## - lcavol 1 25.1255 62.186 -4.453
##
## Step: AIC=-42.42
## lpsa ~ lcavol + lweight + lbph + svi + pgg45
##
## Df Sum of Sq RSS AIC
## - pgg45 1 0.8161 38.794 -42.787
## <none> 37.978 -42.424
## + age 1 0.9174 37.060 -42.307
## - lbph 1 1.6290 39.607 -41.190
## + gleason 1 0.1646 37.813 -40.758
## + lcp 1 0.1057 37.872 -40.638
## - lweight 1 3.3245 41.302 -37.962
## - svi 1 3.9171 41.895 -36.865
## - lcavol 1 24.4609 62.439 -6.141
##
## Step: AIC=-42.79
## lpsa ~ lcavol + lweight + lbph + svi
##
## Df Sum of Sq RSS AIC
## <none> 38.794 -42.787
## + gleason 1 0.8685 37.925 -42.530
## + pgg45 1 0.8161 37.978 -42.424
## + age 1 0.6044 38.190 -41.996
## - lbph 1 2.0087 40.803 -40.900
## + lcp 1 0.0035 38.790 -40.794
## - lweight 1 2.9786 41.773 -39.091
## - svi 1 5.8688 44.663 -33.939
## - lcavol 1 27.6768 66.471 -3.322
## Start: AIC=32.5
## lpsa ~ 1
##
## Df Sum of Sq RSS AIC
## + lcavol 1 61.297 53.121 -24.585
## + lcp 1 35.552 78.865 5.843
## + svi 1 35.144 79.273 6.241
## + pgg45 1 20.457 93.961 19.328
## + gleason 1 16.072 98.346 22.841
## + lweight 1 12.519 101.899 25.574
## + lbph 1 5.991 108.426 30.354
## <none> 114.418 32.496
## + age 1 2.739 111.679 32.630
##
## Step: AIC=-24.59
## lpsa ~ lcavol
##
## Df Sum of Sq RSS AIC
## + lweight 1 6.801 46.320 -33.134
## + lbph 1 5.472 47.648 -30.957
## + svi 1 5.186 47.935 -30.495
## + pgg45 1 2.604 50.517 -26.455
## <none> 53.121 -24.585
## + gleason 1 1.254 51.867 -24.425
## + lcp 1 1.248 51.872 -24.417
## + age 1 0.166 52.954 -22.827
## - lcavol 1 61.297 114.418 32.496
##
## Step: AIC=-33.13
## lpsa ~ lcavol + lweight
##
## Df Sum of Sq RSS AIC
## + svi 1 5.517 40.803 -40.900
## + pgg45 1 3.242 43.078 -36.721
## + gleason 1 2.198 44.121 -34.878
## + lcp 1 1.880 44.439 -34.325
## + lbph 1 1.657 44.663 -33.939
## <none> 46.320 -33.134
## + age 1 0.150 46.169 -31.385
## - lweight 1 6.801 53.121 -24.585
## - lcavol 1 55.579 101.899 25.574
##
## Step: AIC=-40.9
## lpsa ~ lcavol + lweight + svi
##
## Df Sum of Sq RSS AIC
## + lbph 1 2.0087 38.794 -42.787
## + pgg45 1 1.1958 39.607 -41.190
## + gleason 1 1.1379 39.665 -41.077
## <none> 40.803 -40.900
## + age 1 0.1162 40.686 -39.119
## + lcp 1 0.0196 40.783 -38.937
## - svi 1 5.5170 46.320 -33.134
## - lweight 1 7.1323 47.935 -30.495
## - lcavol 1 27.4874 68.290 -3.243
##
## Step: AIC=-42.79
## lpsa ~ lcavol + lweight + svi + lbph
##
## Df Sum of Sq RSS AIC
## <none> 38.794 -42.787
## + gleason 1 0.8685 37.925 -42.530
## + pgg45 1 0.8161 37.978 -42.424
## + age 1 0.6044 38.190 -41.996
## - lbph 1 2.0087 40.803 -40.900
## + lcp 1 0.0035 38.790 -40.794
## - lweight 1 2.9786 41.773 -39.091
## - svi 1 5.8688 44.663 -33.939
## - lcavol 1 27.6768 66.471 -3.322
OLS
Penalized regression
We can add a constraint to OLS (adding a restriction term makes the model less complicated => introduces some bias but lowers the variance).
Add a constraint term
As mentioned in the last part, "t" is the penalty parameter. We can transform t into λ and use it as the constraint term. We add the constraint term and then directly minimize the whole function.
Ridge vs LASSO regression
Ridge regression
LASSO = Least Absolute Shrinkage and Selection Operator
Determination of λ
The determination of λ is based on k-fold cross-validation.
Note: if k is set to the sample size, it is called leave-one-out cross-validation.
Package: glmnet
glmnet(x, y, alpha, lambda, family, ...) to fit a penalized regression
cv.glmnet(x, y, alpha) to determine λ for ridge regression
library(glmnet)
## 1a Specifying λ
x <- model.matrix(lpsa ~ . - 1, data = Training)#glmnet alr include intercept so remove it in x matrix
y <- Training$lpsa
Ridge1 <- glmnet(x, y, alpha = 0, lambda = c(0.5, 1))
Ridge1
coef(Ridge1)Note the sequence of lambda
## 1b Not specifying λ
Ridge2 <- glmnet(x, y, alpha = 0)
#plot(Ridge2, xvar="lambda", label=TRUE)
#plot(Ridge2, xvar="dev", label=TRUE)## 1c Cross-validation to determine λ
set.seed(56789)
Ridge2.cv <- cv.glmnet(x, y, alpha = 0)
#plot(Ridge2.cv)
#Mean MSE (red dot) ± SE (vertical bar)
#Left vertical line: minimum MSE
#Right vertical line: maximum value of λ within 1 SE of the minimum (1-SE rule)
min(Ridge2.cv$cvm)#Minimum MSE
log(Ridge2.cv$lambda.min) #λ to obtain Minimum MSE
log(Ridge2.cv$lambda.1se) #λ selected under 1-SE rule
## 2 Obtain new beta coefficients by 1-SE rule
coef(Ridge2, s = Ridge2.cv$lambda.1se) #supply the λ by specifying "s"
## 3 Predicted values on testing set
x.test <- model.matrix(lpsa ~ . - 1, data = Testing)
Ridge2.Pred <- predict(Ridge2, newx = x.test, s = Ridge2.cv$lambda.1se)## [1] 0.615571
## [1] -2.416623
## [1] -0.3698812
## 9 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -0.234804862
## lcavol 0.328430341
## lweight 0.387430007
## age -0.002396460
## lbph 0.085566483
## svi 0.500485901
## lcp 0.091304642
## gleason 0.125769781
## pgg45 0.003117404
## 1a Specifying λ
LASSO1 <- glmnet(x, y, alpha = 1, lambda = c(0.25, 0.5, 0.75, 1)) #"." = dropped from the model; Highest penalty results in lowest no. of predictors
## 1b Not specifying λ
LASSO2 <- glmnet(x, y, alpha = 1)
#plot(LASSO2, xvar="lambda", label=TRUE)
#plot(LASSO2, xvar="dev", label=TRUE)## 1c Cross-validation to determine λ
set.seed(54321)
LASSO2.cv <- cv.glmnet(x, y, alpha = 1)
#plot(LASSO2.cv)
#Mean MSE (red dot) ± SE (vertical bar)
#Left vertical line: minimum MSE
#Right vertical line: maximum value of λ within 1 SE of the minimum (1-SE rule)
min(LASSO2.cv$cvm)#Minimum MSE
log(LASSO2.cv$lambda.min) #λ to obtain Minimum MSE
log(LASSO2.cv$lambda.1se) #λ selected under 1-SE rule
## 2 Obtain new beta coefficients by 1-SE rule
coef(LASSO2, s = LASSO2.cv$lambda.1se) #supply the λ by specifying "s"
## 3 Predicted values on testing set
LASSO2.Pred <- predict(LASSO2, newx = x.test, s = LASSO2.cv$lambda.1se)## [1] 0.6172947
## [1] -3.370219
## [1] -1.416511
## 9 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 1.1283118
## lcavol 0.4694206
## lweight 0.1733541
## age .
## lbph .
## svi 0.3438657
## lcp .
## gleason .
## pgg45 .
If predictor variables are correlated, the result is large variance, which makes the estimation unstable.
Compared with OLS (conventional linear regression), penalized regression gives robust estimates => the estimates are not too sensitive to small changes, errors, or outliers.
Prostate.col <- Prostate
## create a new variable by adding two variables in the data set (perfect linerality) => fit linear regression
Prostate.col$new1 <- Prostate.col$lcavol + Prostate.col$lweight
lm(formula = lpsa ~ lcavol + lweight + new1, data = Prostate.col)
## add some noise to the new variable => fit linear regression
set.seed(234567)
Prostate.col$new2 <- Prostate.col$new1 + rnorm(nrow(Prostate.col), 0, 0.1)
lm(formula = lpsa ~ lcavol + lweight + new2, data = Prostate.col)
#LR results in unstable estimate for new variable (from NA to 1.2719) if hv collinearity issue
#OLS is sensitive to multicollinearity##
## Call:
## lm(formula = lpsa ~ lcavol + lweight + new1, data = Prostate.col)
##
## Coefficients:
## (Intercept) lcavol lweight new1
## -0.3026 0.6775 0.5109 NA
##
##
## Call:
## lm(formula = lpsa ~ lcavol + lweight + new2, data = Prostate.col)
##
## Coefficients:
## (Intercept) lcavol lweight new2
## -0.2495 -0.5894 -0.7710 1.2719
Comparison of estimates from ridge regression:
Similar/robust estimates => not too sensitive to small changes, errors, or outliers.
We usually assume the sample size (n) is greater than the number of predictors (p).
However, in analyses such as genomic studies, 10,000 gene predictors (p) may be used to explore the risk of a rare disease with low incidence (n), so that p > n.
Ridge and LASSO regression can give estimates even when n < p.
## Create a data set with sample size = 6
set.seed(21098)
SmallIndex <- sample(1:nrow(Prostate), 6)
Small <- Prostate[SmallIndex, ]
## 1 OLS -- only get 6 estimate coz cannot hv estimate > sample size
lm(formula = lpsa ~ lcavol + lweight + age + lbph + svi + lcp + gleason + pgg45, data = Small)
## 2 Ridge and LASSO -- still can give estimate
x.Small <- model.matrix(lpsa ~ lcavol + lweight + age + lbph + svi + lcp +
gleason + pgg45 - 1, data = Small)
y.Small <- Small$lpsa
Small.Ridge <- glmnet(x.Small, y.Small, alpha = 0, lambda = c(0.1, 0.01))
coef(Small.Ridge)
Small.LASSO <- glmnet(x.Small, y.Small, alpha = 1, lambda = c(0.1, 0.01))
coef(Small.LASSO)##
## Call:
## lm(formula = lpsa ~ lcavol + lweight + age + lbph + svi + lcp +
## gleason + pgg45, data = Small)
##
## Coefficients:
## (Intercept) lcavol lweight age lbph svi
## -5.67764 0.69457 0.70011 0.07270 -0.03293 NA
## lcp gleason pgg45
## 0.15074 NA NA
##
## 9 x 2 sparse Matrix of class "dgCMatrix"
## s0 s1
## (Intercept) -3.983189980 -3.34701545
## lcavol 0.634838300 0.71370283
## lweight 0.377882039 0.39886193
## age 0.052767193 0.04535218
## lbph 0.043633242 0.03597348
## svi . .
## lcp 0.337259634 0.42433255
## gleason 0.156275254 0.12605415
## pgg45 0.005244347 0.00601659
## 9 x 2 sparse Matrix of class "dgCMatrix"
## s0 s1
## (Intercept) -4.61897188 -5.5170679541
## lcavol 0.62848146 0.6821712429
## lweight 0.45130177 0.5797324637
## age 0.06858169 0.0746291622
## lbph . .
## svi . .
## lcp . 0.0129012020
## gleason . .
## pgg45 . 0.0005385419
The elastic net penalty is a weighted combination of the ridge and LASSO penalty functions, with the mixing parameter α controlling the weighting between the two.
α: 0 = ridge regression, 1 = LASSO regression, 0 < α < 1 = elastic net
x <- model.matrix(lpsa ~ . - 1, data = Training)
y <- Training$lpsa
x.test <- model.matrix(lpsa ~ . - 1, data = Testing)
library(glmnet)
Elastic000 <- glmnet(x, y, alpha = 0)
Elastic005 <- glmnet(x, y, alpha = 0.05)
Elastic010 <- glmnet(x, y, alpha = 0.1)
Elastic020 <- glmnet(x, y, alpha = 0.2)
Elastic050 <- glmnet(x, y, alpha = 0.5)
Elastic080 <- glmnet(x, y, alpha = 0.8)
Elastic090 <- glmnet(x, y, alpha = 0.9)
Elastic095 <- glmnet(x, y, alpha = 0.95)
Elastic100 <- glmnet(x, y, alpha = 1)
par(mfrow=c(3,3), mar=c(5,4,2,1))
plot(Elastic000, xvar="lambda", label=TRUE)
plot(Elastic005, xvar="lambda", label=TRUE)
plot(Elastic010, xvar="lambda", label=TRUE)
plot(Elastic020, xvar="lambda", label=TRUE)
plot(Elastic050, xvar="lambda", label=TRUE)
plot(Elastic080, xvar="lambda", label=TRUE)
plot(Elastic090, xvar="lambda", label=TRUE)
plot(Elastic095, xvar="lambda", label=TRUE)
plot(Elastic100, xvar="lambda", label=TRUE)## 1 Cross-validation to determine λ
set.seed(76543)
Elastic.cv <- cv.glmnet(x, y, alpha = 0.5)
#plot(Elastic.cv)
min(Elastic.cv$cvm)
log(Elastic.cv$lambda.min)
log(Elastic.cv$lambda.1se)
## 2 Obtain new beta coefficients by 1-SE rule
Elastic050 <- glmnet(x, y, alpha = 0.5)
coef(Elastic050, s = Elastic.cv$lambda.1se)
## 3 Predicted values on testing set
Elastic.Pred <- predict(Elastic050, newx = x.test, s = Elastic.cv$lambda.1se)## [1] 0.6108528
## [1] -2.956173
## [1] -1.095499
## 9 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 0.818501440
## lcavol 0.425218017
## lweight 0.258692154
## age .
## lbph 0.030889415
## svi 0.447316008
## lcp 0.015142118
## gleason .
## pgg45 0.001737656
Two real-life examples.
LowBWT <- read.csv("Data/lowbwt.csv")
LowBWT$LOW <- factor(LowBWT$LOW) #categorize variables
LowBWT$RACE <- factor(LowBWT$RACE)
LowBWT$SMOKE <- factor(LowBWT$SMOKE)
LowBWT$HT <- factor(LowBWT$HT)
LowBWT$UI <- factor(LowBWT$UI)
# Fitting continuous outcome, variable "BWT"
x.con = model.matrix(BWT ~ . - 1 - ID - LOW, data = LowBWT)
y.con = LowBWT$BWT
# Fitting binary outcome, variable "LOW"
x.bin = model.matrix(LOW ~ . - 1 - ID - BWT, data = LowBWT)
y.bin = LowBWT$LOWRidge1 <- glmnet(x.con, y.con, alpha = 0)
#plot(Ridge1, xvar="lambda", label=TRUE)
#plot(Ridge1, xvar="dev", label=TRUE)
set.seed(56789)
Ridge1.cv = cv.glmnet(x.con, y.con, alpha = 0)
#plot(Ridge1.cv)
coef(Ridge1, s = Ridge1.cv$lambda.1se)## 11 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 2847.7067856
## AGE 1.6026551
## LWT 0.7244884
## RACE1 50.8523715
## RACE2 -44.7463234
## RACE3 -32.3425866
## SMOKE1 -51.2953483
## PTL -34.7321318
## HT1 -80.3629270
## UI1 -96.8631102
## FTV 4.7682107
LASSO1 <- glmnet(x.con, y.con, alpha = 1)
#plot(LASSO1, xvar="lambda", label=TRUE)
#plot(LASSO1, xvar="dev", label=TRUE)
set.seed(65432)
LASSO1.cv = cv.glmnet(x.con, y.con, alpha = 1)
#plot(LASSO1.cv)
coef(LASSO1, s = LASSO1.cv$lambda.1se)## 11 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 2919.9406349
## AGE .
## LWT 0.2770576
## RACE1 126.8831127
## RACE2 .
## RACE3 .
## SMOKE1 -89.3331248
## PTL .
## HT1 -29.7483236
## UI1 -262.1262526
## FTV .
Ridge2 <- glmnet(x.bin, y.bin, alpha = 0, family = "binomial")
#plot(Ridge2, xvar="lambda", label=TRUE)
#plot(Ridge2, xvar="dev", label=TRUE)
set.seed(56789)
Ridge2.cv = cv.glmnet(x.bin, y.bin, alpha = 0, family = "binomial")
#plot(Ridge2.cv)
coef(Ridge2, s = Ridge2.cv$lambda.1se)## 11 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -0.560026872
## AGE -0.005036865
## LWT -0.001320565
## RACE1 -0.072877588
## RACE2 0.067682691
## RACE3 0.044508306
## SMOKE1 0.082406149
## PTL 0.093369023
## HT1 0.158414316
## UI1 0.110805589
## FTV -0.012006530
LASSO2 <- glmnet(x.bin, y.bin, alpha = 1, family = "binomial")
#plot(LASSO2, xvar="lambda", label=TRUE)
#plot(LASSO2, xvar="dev", label=TRUE)
set.seed(65432)
LASSO2.cv = cv.glmnet(x.bin, y.bin, alpha = 1, family = "binomial")
#plot(LASSO2.cv)
coef(LASSO2, s = LASSO2.cv$lambda.1se)## 11 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -0.402500274
## AGE .
## LWT -0.003655509
## RACE1 -0.248026319
## RACE2 .
## RACE3 .
## SMOKE1 0.229056074
## PTL 0.245427433
## HT1 0.465913634
## UI1 0.214356081
## FTV .